Libraries

# FIX: library(ggplot2) was loaded twice; duplicate removed.
library(data.table)       # fast tables: rbindlist(), keyed joins, setkey()
library(magrittr)         # %>% pipe
library(ggplot2)          # plotting
library(dplyr)            # data-wrangling verbs (select/mutate/joins)
library(stringr)          # string helpers (str_replace)
library(gridExtra)        # grid.arrange() for multi-panel plots
library(ggExtra)          # ggMarginal()
library(GGally)           # ggscatmat()
library(caret)            # train(), createDataPartition(), trainControl()
library(glmnet)           # regularized logistic regression
library(geosphere)        # geographic distance (distm)
library(InformationValue) # optimalCutoff(), confusionMatrix(), plotROC()
library(ggcorrplot)       # correlation heatmap

Get data

# Team lookup: TeamID, TeamName, first/last Division I season
teams <- read.csv("Data/Teams.csv")

# Season metadata: start date (DayZero) and the four region names
seasons <- read.csv("Data/Seasons.csv")

# Tournament seed per team per season (e.g. "W01")
seeds <- read.csv("Data/NCAATourneySeeds.csv")

# Conference abbreviation -> full description lookup
conferences <- read.csv("Data/Conferences.csv")

# Head coach per team per season, with first/last active day
coaches <- read.csv("Data/TeamCoaches.csv")

# Conference membership per team per season
team_conferences <- read.csv("Data/TeamConferences.csv")

# Tournament game results: compact (scores only) and detailed (box scores)
tour_compact_res <- read.csv("Data/NCAATourneyCompactResults.csv")

tour_detail_res <- read.csv("Data/NCAATourneyDetailedResults.csv")

# Regular-season game results: compact and detailed
seas_compact_res <- read.csv("Data/RegularSeasonCompactResults.csv")

seas_detail_res <- read.csv("Data/RegularSeasonDetailedResults.csv")

# Pre-built master file with venue/geo columns per game/team, incl. Distance
new_master_file <- read.csv("Data/df11.csv")

Data Exploration

View each table to understand the variables

# Preview: team ID/name lookup with first and last D1 seasons
head(teams,n=5)
##   TeamID    TeamName FirstD1Season LastD1Season
## 1   1101 Abilene Chr          2014         2018
## 2   1102   Air Force          1985         2018
## 3   1103       Akron          1985         2018
## 4   1104     Alabama          1985         2018
## 5   1105 Alabama A&M          2000         2018
# Preview: season start dates and region names
head(seasons,n=5)
##   Season    DayZero RegionW   RegionX   RegionY   RegionZ
## 1   1985 10/29/1984    East      West   Midwest Southeast
## 2   1986 10/28/1985    East   Midwest Southeast      West
## 3   1987 10/27/1986    East Southeast   Midwest      West
## 4   1988  11/2/1987    East   Midwest Southeast      West
## 5   1989 10/31/1988    East      West   Midwest Southeast
# Preview: tournament seeds (region letter + 2-digit rank)
head(seeds,n=5)
##   Season Seed TeamID
## 1   1985  W01   1207
## 2   1985  W02   1210
## 3   1985  W03   1228
## 4   1985  W04   1260
## 5   1985  W05   1374
# Preview: conference abbreviation/description lookup
head(conferences,n=5)
##   ConfAbbrev                  Description
## 1      a_sun      Atlantic Sun Conference
## 2      a_ten       Atlantic 10 Conference
## 3        aac American Athletic Conference
## 4        acc    Atlantic Coast Conference
## 5        aec      America East Conference
# Preview: coach assignments per team-season with day ranges
head(coaches,n=5)
##   Season TeamID FirstDayNum LastDayNum      CoachName
## 1   1985   1102           0        154  reggie_minton
## 2   1985   1103           0        154    bob_huggins
## 3   1985   1104           0        154 wimp_sanderson
## 4   1985   1106           0        154   james_oliver
## 5   1985   1108           0        154  davey_whitney
# Preview: conference membership per team-season
head(team_conferences,n=5)
##   Season TeamID ConfAbbrev
## 1   1985   1114      a_sun
## 2   1985   1147      a_sun
## 3   1985   1204      a_sun
## 4   1985   1209      a_sun
## 5   1985   1215      a_sun
# Preview: tournament compact results (winner/loser IDs and scores)
head(tour_compact_res,n=5)
##   Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1   1985    136    1116     63    1234     54    N     0
## 2   1985    136    1120     59    1345     58    N     0
## 3   1985    136    1207     68    1250     43    N     0
## 4   1985    136    1229     58    1425     55    N     0
## 5   1985    136    1242     49    1325     38    N     0
# Preview: tournament detailed results (full box-score stats, from 2003)
head(tour_detail_res,n=5)
##   Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1   2003    134    1421     92    1411     84    N     1   32   69    11
## 2   2003    136    1112     80    1436     51    N     0   31   66     7
## 3   2003    136    1113     84    1272     71    N     0   31   59     6
## 4   2003    136    1141     79    1166     73    N     0   29   53     3
## 5   2003    136    1143     76    1301     74    N     1   27   64     7
##   WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1    29   17   26  14  30   17  12    5    3  22   29   67    12    31
## 2    23   11   14  11  36   22  16   10    7   8   20   64     4    16
## 3    14   16   22  10  27   18   9    7    4  19   25   69     7    28
## 4     7   18   25  11  20   15  18   13    1  19   27   60     7    17
## 5    20   15   23  18  20   17  13    8    2  14   25   56     9    21
##   LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1   14   31  17  28   16  15    5    0  22
## 2    7    7   8  26   12  17   10    3  15
## 3   14   21  20  22   11  12    2    5  18
## 4   12   17  14  17   20  21    6    6  21
## 5   15   20  10  26   16  14    5    8  19
# Preview: regular-season compact results
head(seas_compact_res,n=5)
##   Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT
## 1   1985     20    1228     81    1328     64    N     0
## 2   1985     25    1106     77    1354     70    H     0
## 3   1985     25    1112     63    1223     56    H     0
## 4   1985     25    1165     70    1432     54    H     0
## 5   1985     25    1192     86    1447     74    H     0
# Preview: regular-season detailed results (box-score stats, from 2003)
head(seas_detail_res,n=5)
##   Season DayNum WTeamID WScore LTeamID LScore WLoc NumOT WFGM WFGA WFGM3
## 1   2003     10    1104     68    1328     62    N     0   27   58     3
## 2   2003     10    1272     70    1393     63    N     0   26   62     8
## 3   2003     11    1266     73    1437     61    N     0   24   58     8
## 4   2003     11    1296     56    1457     50    N     0   18   38     3
## 5   2003     11    1400     77    1208     71    N     0   30   61     6
##   WFGA3 WFTM WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3
## 1    14   11   18  14  24   13  23    7    1  22   22   53     2    10
## 2    20   10   19  15  28   16  13    4    4  18   24   67     6    24
## 3    18   17   29  17  26   15  10    5    2  25   22   73     3    26
## 4     9   17   31   6  19   11  12   14    2  18   18   49     6    22
## 5    14   11   13  17  22   12  14    4    4  20   24   62     6    16
##   LFTM LFTA LOR LDR LAst LTO LStl LBlk LPF
## 1   16   22  10  22    8  18    9    2  20
## 2    9   20  20  25    7  12    8    6  16
## 3   14   23  31  22    9  12    2    5  23
## 4    8   15  17  20    9  19    4    3  23
## 5   17   27  21  15   12  10    7    1  14
# Preview: master file with seeds, conferences, venue/geo columns and Distance
head(new_master_file,n=5)
##   X.1 X Season DayNum WScore LScore WLoc NumOT WFGM WFGA WFGM3 WFGA3 WFTM
## 1   1 1   2003    134     92     84    N     1   32   69    11    29   17
## 2   2 2   2003    136     80     51    N     0   31   66     7    23   11
## 3   3 3   2003    136     84     71    N     0   31   59     6    14   16
## 4   4 4   2003    136     79     73    N     0   29   53     3     7   18
## 5   5 5   2003    136     76     74    N     1   27   64     7    20   15
##   WFTA WOR WDR WAst WTO WStl WBlk WPF LFGM LFGA LFGM3 LFGA3 LFTM LFTA LOR
## 1   26  14  30   17  12    5    3  22   29   67    12    31   14   31  17
## 2   14  11  36   22  16   10    7   8   20   64     4    16    7    7   8
## 3   22  10  27   18   9    7    4  19   25   69     7    28   14   21  20
## 4   25  11  20   15  18   13    1  19   27   60     7    17   12   17  14
## 5   23  18  20   17  13    8    2  14   25   56     9    21   15   20  10
##   LDR LAst LTO LStl LBlk LPF TeamID.Type TeamID Region Play_In Seeding
## 1  28   16  15    5    0  22     WTeamID   1421      X       1      16
## 2  26   12  17   10    3  15     WTeamID   1112      Z       0       1
## 3  22   11  12    2    5  18     WTeamID   1113      Z       0      10
## 4  17   20  21    6    6  21     WTeamID   1141      Z       0      11
## 5  26   16  14    5    8  19     WTeamID   1143      W       0       8
##   CRType CityID City State ConfAbbrev                      Home_Address
## 1   <NA>     NA <NA>  <NA>  big_south       Asheville, NC, Kimmel Arena
## 2   <NA>     NA <NA>  <NA>    pac_ten         Tucson, AZ, McKale Center
## 3   <NA>     NA <NA>  <NA>    pac_ten      Tempe, AZ, Wells Fargo Arena
## 4   <NA>     NA <NA>  <NA>        mac Mount Pleasant, MI, McGuirk Arena
## 5   <NA>     NA <NA>  <NA>    pac_ten       Berkeley, CA, Haas Pavilion
##          Home_Arena      Home_City Home_State..Abrv     Home_State
## 1      Kimmel Arena      Asheville               NC North Carolina
## 2     McKale Center         Tucson               AZ        Arizona
## 3 Wells Fargo Arena          Tempe               AZ        Arizona
## 4     McGuirk Arena Mount Pleasant               MI       Michigan
## 5     Haas Pavilion       Berkeley               CA     California
##          Home_Team Home_Conference Home_Capacity Home_Opened   Home_Lat
## 1    UNC Asheville       Big South         3,200        2011  -82.56731
## 2          Arizona          Pac-12        14,545        1973 -110.94607
## 3    Arizona State          Pac-12        10,754        1974 -111.93101
## 4 Central Michigan             MAC         5,300        1973  -84.77406
## 5       California          Pac-12        11,877        1933 -122.26231
##   Home_Lon  Home_Team_Match Tourney_Address Tourney_Lat Tourney_Lon
## 1 35.61629    unc asheville            <NA>   -92.25413    38.12314
## 2 32.23030          arizona            <NA>   -92.25413    38.12314
## 3 33.42450    arizona state            <NA>   -92.25413    38.12314
## 4 43.58149 central michigan            <NA>   -92.25413    38.12314
## 5 37.86939       california            <NA>   -92.25413    38.12314
##   Tourney_Round Tourney_State Tourney_Stadium  Distance
## 1          <NA>          <NA>            <NA>  906151.9
## 2          <NA>          <NA>            <NA> 1818807.5
## 3          <NA>          <NA>            <NA> 1848419.1
## 4          <NA>          <NA>            <NA>  874461.0
## 5          <NA>          <NA>            <NA> 2621042.7

Data Preparing for Visualization and Modeling

We want to train a model that predicts tournament wins and losses. We randomly assign the winning and losing teams of each game to "team 1" and "team 2" (necessary so that the modeled probability of team 1 winning is well defined) and drop the other ID columns.

# Randomly assign each game's winner/loser to "team 1" / "team 2" so the
# outcome column (team1win) is balanced rather than always 1.
# FIX: use n() (row count of the piped data) instead of the fragile
# dim(tour_compact_res)[1], which silently breaks if the source table changes.
# NOTE(review): runif() is not seeded, so this split is not reproducible —
# consider a set.seed() call before this step.
rand_tourn_res <- tour_compact_res %>%
  select(Season, DayNum, WTeamID, LTeamID) %>%
  mutate(
    rand = runif(n()),
    team1id = ifelse(rand >= 0.5, WTeamID, LTeamID),
    team2id = ifelse(rand < 0.5, WTeamID, LTeamID),
    team1win = ifelse(team1id == WTeamID, 1, 0)  # 1 when team 1 is the winner
  ) %>%
  select(-rand, -WTeamID, -LTeamID)

# rand_tourn_res <- new_master_file %>% select(Season,DayNum,WTeamID,LTeamID,Distance) %>% mutate(rand = runif(dim(tour_compact_res)[1]),
#          team1id = ifelse(rand >= 0.5, WTeamID, LTeamID),
#          team2id = ifelse(rand <0.5, WTeamID, LTeamID),
#          team1win = ifelse(team1id == WTeamID, 1, 0)) %>%
#   select(-rand, -WTeamID,-LTeamID)

Then we add seeding information to games as seeding can be an important predictor

# Strip the region letter (e.g. "W01" -> "01") to get the seed ranking, and
# strip any play-in suffix letter (e.g. "16a" -> "16") to get a numeric seed.
# BUG FIX: the original pattern ".[a-z]" matched the digit BEFORE the play-in
# letter too, so seed "16a" became as.numeric("1") = 1 instead of 16; "[a-z]"
# removes only the suffix letter itself.
seeds_tournment <- seeds %>%
  mutate(
    ranking = as.factor(str_replace(Seed, "[A-Z]", "")),
    rank_num = as.numeric(str_replace(ranking, "[a-z]", ""))
  )

# Attach each team's tournament seed (factor ranking + numeric rank) to the
# randomized results: one left join keyed on team 1, one keyed on team 2.
rand_tourn_res <- rand_tourn_res %>%
  left_join(
    seeds_tournment %>%
      select(t1_rank = ranking, t1_rank_n = rank_num, TeamID, Season),
    by = c("team1id" = "TeamID", "Season" = "Season")
  ) %>%
  left_join(
    seeds_tournment %>%
      select(t2_rank = ranking, t2_rank_n = rank_num, TeamID, Season),
    by = c("team2id" = "TeamID", "Season" = "Season")
  )

# Teams without a tournament seed are imputed with 8 (the mid-point seed);
# diff_rank then captures the seed gap between the two teams.
# NOTE(review): t1_rank/t2_rank are factors at this point, and ifelse() on a
# factor returns the underlying integer level code rather than the label —
# so after this step t1_rank/t2_rank hold level codes, not literal seed
# numbers. Confirm that is intended before relying on their absolute values.
rand_tourn_res <- rand_tourn_res %>% mutate(t1_rank = ifelse(is.na(t1_rank), 8, t1_rank),
                                                    t2_rank = ifelse(is.na(t2_rank), 8, t2_rank),
                                                    t1_rank_n = ifelse(is.na(t1_rank_n), 8, t1_rank_n),
                                                    t2_rank_n = ifelse(is.na(t2_rank_n), 8, t2_rank_n),
                                                    diff_rank = t1_rank_n - t2_rank_n)

Besides seeding information, we are also interested in how certain regular-season statistics correlate with winning versus losing. We take the regular-season detailed results and stack them vertically, with a single column of TeamIDs and a factor indicating whether each row corresponds to a win or a loss.

# Winning-team stats from the regular-season detailed results,
# tagged with Res = 1 (a win).
win_predictors <- seas_detail_res %>%
  select(Season, starts_with("W")) %>%
  mutate(Res = 1)
# Drop the leading "W" from every column name. Side effect: "Season" becomes
# "eason" and "Res" becomes "es" — both are renamed back after stacking.
names(win_predictors) <- substring(names(win_predictors), 2)
# "WLoc" (now "Loc") exists only on the winning side; drop it so the winner
# and loser tables have matching columns for row-binding.
win_predictors <- win_predictors %>% select(-Loc)

# Losing-team stats, built the same way with Res = 0 (a loss) and the
# leading "L" stripped from column names (again mangling Season/Res).
lose_predictors <- seas_detail_res %>%
  select(Season, starts_with("L")) %>%
  mutate(Res = 0)
names(lose_predictors) <- substring(names(lose_predictors), 2)
</lose_predictors>

# Stack winner and loser rows into one long table (one row per team per game),
# repair the names mangled by the substring() rename, and derive extra stats.
predictors_all <- rbindlist(list(win_predictors, lose_predictors)) %>%
  rename(Season = eason, Res = es) %>%
  mutate(
    # Shooting percentages: overall, 2-point, 3-point, free throws
    FGP  = FGM / FGA,
    FGP2 = (FGM - FGM3) / (FGA - FGA3),
    FGP3 = FGM3 / FGA3,
    FTP  = FTM / FTA,
    # Res as a two-level factor ('W'/'L') for plotting ...
    Res = factor(ifelse(Res == 1, 'W', 'L')),
    # ... and Outcome as its 1/0 numeric twin for modeling
    Outcome = ifelse(Res == 'W', 1, 0)
  )

Create mean stats for each season, each team using group by and summarise_all function

# Per-season, per-team averages of every stat column.
# FIX: funs() is deprecated in dplyr — summarise_all() takes the function
# directly and returns the identical result.
# NOTE: summarise drops one grouping level, so the result is still grouped
# by Season.
mean_predictors <- predictors_all %>%
  group_by(Season, TeamID) %>%
  summarise_all(mean)

Join the mean stats with the tournament results dataframe; the resulting model_df is our main dataframe for model building. Note that we have not yet added the distance from the game venue to each team's home stadium — TODO

# Attach team 1's season-average stats to each tournament game
model_df <- rand_tourn_res %>%  inner_join(
    mean_predictors,
    by = c("team1id"="TeamID","Season"="Season")) 

# Attach team 2's season-average stats (same columns, so dplyr suffixes
# the duplicates with .x / .y)
model_df <- model_df %>%  inner_join(
    mean_predictors,
    by = c("team2id"="TeamID","Season"="Season")) 

# Re-suffix duplicated stat columns with _t1/_t2 for team 1 / team 2.
# NOTE(review): '.x'/'.y' are treated as REGEX here, so '.' matches any
# character — e.g. "DayNum" becomes "D_t2Num" (the "ay" matches ".y").
# Downstream code depends on the mangled name 'D_t2Num', so switching to
# gsub(..., fixed = TRUE) would require updating those references too.
names(model_df) <- gsub('.x','_t1',names(model_df))
names(model_df) <- gsub('.y','_t2',names(model_df))

Add distance metrics to model_df by merging

# Attach each team's travel distance (game venue to home arena) by matching
# season / day / team against the master file.
# NOTE(review): 'D_t2Num' is the DayNum column after the earlier
# gsub('.y', '_t2', ...) rename, where the unescaped '.' matched the 'a'
# in "Day". Left joins (all.x = TRUE) keep games with no distance record.
model_df <- merge(x=model_df,y=new_master_file%>%select('Season','DayNum','TeamID','Distance'),by.x = c('Season','D_t2Num','team1id'),by.y=c('Season','DayNum','TeamID'),all.x=TRUE)

model_df <- merge(x=model_df,y=new_master_file%>%select('Season','DayNum','TeamID','Distance'),by.x = c('Season','D_t2Num','team2id'),by.y=c('Season','DayNum','TeamID'),all.x=TRUE)

# Suffix the two newly merged Distance columns with _t1/_t2.
# NOTE(review): the same unescaped-regex caveat applies to these gsub calls.
names(model_df) <- gsub('.x','_t1',names(model_df))
names(model_df) <- gsub('.y','_t2',names(model_df))

Visualization plots

# Boxplot: does team 1's travel distance differ between wins and losses?
model_df$team1win <- as.factor(model_df$team1win)
ggplot(model_df,aes(x=team1win, y=Distance_t1,fill=team1win))+geom_boxplot()

# Pairwise scatterplot matrices of the stacked game stats, colored by game
# result ('Res'), taken a few columns at a time so each panel stays readable.
ggscatmat(predictors_all,columns = 3:5,color = 'Res')

ggscatmat(predictors_all,columns = 6:8,color = 'Res')

ggscatmat(predictors_all,columns = 9:10,color = 'Res')

ggscatmat(predictors_all,columns = 11:13,color = 'Res')

ggscatmat(predictors_all,columns = 14:16,color = 'Res')

# This column range includes the factor 'Res' itself, hence the warning below
ggscatmat(predictors_all,columns = 17:19,color = 'Res')
## Warning in ggscatmat(predictors_all, columns = 17:19, color = "Res"):
## Factor variables are omitted in plot

# Presumably the derived percentage columns — these contain NaN where the
# attempt count was 0, which produces the removed-rows warnings
ggscatmat(predictors_all,columns = 20:21,color = 'Res')
## Warning: Removed 32 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_text).

Correlation plot

# Correlation matrix of the numeric game stats, rounded to 1 dp.
# Res (a factor) and FTP (can be NaN when FTA == 0) are excluded so cor() works.
corr <- round(cor(predictors_all%>%select(-c(Res,FTP))),1)

# Hierarchically-clustered lower-triangle correlogram.
# FIX: the title said "Correlogram of mtcars" — a copy-paste leftover from the
# ggcorrplot example; this is game-statistics data.
ggcorrplot(corr, hc.order = TRUE,
           type = "lower",
           lab = TRUE,
           lab_size = 3,
           method="circle",
           colors = c("tomato2", "white", "springgreen3"),
           title="Correlogram of game statistics",
           ggtheme=theme_bw)

Scatter plot with outcome relationship

# Count-scatter of overall field-goal percentage against game outcome,
# with a linear trend line (no confidence band).
theme_set(theme_bw())
g <- ggplot(predictors_all, aes(Outcome, FGP)) +
  geom_count() +
  geom_smooth(method="lm", se=F)

# Marginal histogram / boxplot variants of the same plot
ggMarginal(g, type = "histogram", fill="transparent")
ggMarginal(g, type = "boxplot", fill="transparent")

g

Historic performances

# Convert the lookup/result tables to data.table and key teams/seeds by
# TeamID so the bracket-join syntax below (teams[seeds], etc.) works.
teams <- data.table(teams)
seeds <- data.table(seeds)
seas_compact_res <- data.table(seas_compact_res)
tour_compact_res <- data.table(tour_compact_res)

setkey(teams, TeamID)
setkey(seeds, TeamID)


# Lollipop chart: top 20 programs by number of No. 1 seeds since 1985.
# teams[seeds] is a keyed join on TeamID; one_seed flags seed number 01.
g1 <-
    teams[seeds][, one_seed := as.numeric(substr(Seed, 2, 3)) == 1][, sum(one_seed), by = TeamName][order(V1, decreasing = T)][1:20,] %>%
    ggplot(aes(x = reorder(TeamName, V1), y = V1,label = V1)) + geom_text(color="white", size=2) +
    geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName, V1), yend = V1,xend =reorder(TeamName, V1) ),color = 'black') +
    labs(x = '', y = 'No 1 seeds', title = 'No. 1 Seeds since 1985') +
    coord_flip()

# Lollipop chart: top 20 programs by total regular-season wins since 1985.
# Key regular-season results by winner ID so the join counts wins per team.
setkey(seas_compact_res,WTeamID)
g2 <-
    seas_compact_res[teams][, .(wins = .N), by = TeamName][order(-wins)][1:20,] %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins,label = wins)) + geom_text(color="white", size=2) +
    geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName,wins), yend = wins,xend =reorder(TeamName, wins) ),color = 'black') +
    labs(x = '', y = 'Wins', title = 'Regular Season Wins since 1985') +
    coord_flip()

# Lollipop chart: top 20 programs by tournament wins since 1985.
setkey(tour_compact_res, WTeamID)

g3 <-
    tour_compact_res[teams][, .(wins = .N), by = TeamName][order(-wins)][1:20,] %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins,label = wins)) + geom_text(color="white", size=2) +
    geom_point(stat = 'identity', fill = 'darkblue',size = 4) + geom_segment(aes(y=0,x = reorder(TeamName,wins), yend = wins,xend =reorder(TeamName, wins) ),color = 'black') +
    labs(x = '', y = 'Wins', title = 'Tournament Wins since 1985') +
    coord_flip()

# Lollipop chart: championships since 1985 (the title game is DayNum 154).
# FIX: the original took rows with [0:20,], which (a) only works because
# data.table silently drops the 0 index, and (b) yields all-NA rows when fewer
# than 20 teams have a championship — those NA rows caused the
# "Removed 3 rows" warnings under grid.arrange. head(20) caps at the
# available row count instead.
g4 <-
    tour_compact_res[teams][DayNum == 154, .(wins = .N), by = TeamName][order(-wins)] %>%
    head(20) %>%
    ggplot(aes(x = reorder(TeamName, wins), y = wins, label = wins)) +
    geom_text(color = "white", size = 2) +
    geom_point(stat = 'identity', fill = 'darkblue', size = 4) +
    geom_segment(aes(y = 0, x = reorder(TeamName, wins), yend = wins, xend = reorder(TeamName, wins)), color = 'black') +
    labs(x = '', y = 'Championships', title = 'Tournament Championships since 1985') +
    coord_flip()

# 2x2 panel of the four historic-performance charts
grid.arrange(g1, g2, g3, g4, nrow = 2)
## Warning: Removed 3 rows containing missing values (geom_text).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_segment).

How conferences compare with each other in terms of winning championships

## Conference analysis: championship wins per conference since 1985.
# Join title-game results (DayNum == 154) to conference membership, count wins
# per conference, then attach the full conference descriptions.
# NOTE(review): j = .(ConfAbbrev, wins = .N) combined with by = ConfAbbrev
# repeats the grouping column — confirm the resulting shape is intended.
conf_analysis <- tour_compact_res[team_conferences, on = c(WTeamID = 'TeamID', 'Season'), nomatch = 0
             ][DayNum == 154, .(ConfAbbrev, wins = .N), by = ConfAbbrev
               ][conferences, on = 'ConfAbbrev', nomatch = 0]
# Flag conferences above/below 4 championships for the diverging colors
conf_analysis$overall_type <- ifelse(conf_analysis$wins < 4, "below", "above")
conf_analysis <- conf_analysis[order(conf_analysis$wins),]
# Freeze factor levels in win order so ggplot preserves the sorted order
conf_analysis$`Description` <- factor(conf_analysis$`Description`, levels = conf_analysis$`Description`)

# Diverging dot plot of championships by conference
ggplot(conf_analysis, aes(x=`Description`, y=wins, label=wins)) +
  geom_point(stat='identity', aes(col=overall_type), size=6)  +
  scale_color_manual(name="Champion wins",
                     labels = c("Above Average", "Below Average"),
                     values = c("above"="#00ba38", "below"="#f8766d")) +
  geom_text(color="white", size=2) +
  labs(title="Diverging Dot Plot",
       subtitle="Champion Wins by Conferences") +
  ylim(0, 12) +
  coord_flip()

Cross validation Modeling

We use 10-fold cross-validation to select and build regularized logistic models with the glmnet function. There are two common types of regularized logistic models: Lasso and Ridge. A hybrid of the two is the elastic net model, whose mixture level is controlled by the Alpha parameter. The other parameter is Lambda, the coefficient of the penalty added to the model's loss function. More details on using glmnet can be found at: https://www.rdocumentation.org/packages/glmnet/versions/2.0-16/topics/glmnet

# glmnet cannot handle NA values, so fill all remaining NAs with 0
# before modeling
model_df[is.na(model_df)] <- 0 

# Since we are predicting a classification, use a 2-level factor as the
# outcome column
model_df$team1win <- as.factor(model_df$team1win)

# 80/20 stratified train/test split on the outcome.
# NOTE(review): createDataPartition is random and no seed is set, so the
# split (and everything downstream) is not reproducible run-to-run.
y <- model_df[,"team1win"]
train_set <- createDataPartition(y, p = 0.8, list = FALSE)
data_train <- model_df[train_set, ]
data_test <- model_df[-train_set, ]

# We define several alpha and lamda hyperparameter valeus to be cross validated evaluated
# Hyperparameter grid for cross-validation: alpha mixes ridge (0) and
# lasso (1); lambda is the strength of the penalty term.
glmnet_grid <- expand.grid(
  alpha  = c(0, 0.1, 0.2, 0.4, 0.6, 0.8, 1),
  lambda = seq(0.01, 0.2, length = 20)
)
# 10-fold cross-validation
glmnet_ctrl <- trainControl(method = "cv", number = 10)

# Fit an elastic-net logistic model over the hyperparameter grid with
# 10-fold CV; predictors are centered and scaled first.
# Dropped columns:
#  - 'Season', 'D_t2Num', 'team1id', 'team2id': identifiers with no
#    predictive power
#  - 'diff_rank', 'FTM_t1', 'FTM_t2', 'Res_t2', 'Res_t1': (near-)linear
#    functions of other predictors — multicollinearity, which must be
#    avoided in any linear-based model
glmnet_fit <- train(team1win ~ ., data = data_train%>%select(-c('Season','D_t2Num','team1id','team2id','diff_rank','FTM_t1','FTM_t2','Res_t2','Res_t1')),
                    method = "glmnet",
                    preProcess = c("center", "scale"),
                    tuneGrid = glmnet_grid,
                    trControl = glmnet_ctrl,family = 'binomial')


# Print the resampling results across the tuning grid
glmnet_fit
## glmnet 
## 
## 860 samples
##  42 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (42), scaled (42) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 774, 774, 774, 774, 774, 774, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda  Accuracy   Kappa     
##   0.0    0.01    0.7162791  0.43253366
##   0.0    0.02    0.7162791  0.43253366
##   0.0    0.03    0.7116279  0.42311315
##   0.0    0.04    0.7104651  0.42061856
##   0.0    0.05    0.7116279  0.42291142
##   0.0    0.06    0.7093023  0.41824263
##   0.0    0.07    0.7081395  0.41594729
##   0.0    0.08    0.7139535  0.42756592
##   0.0    0.09    0.7127907  0.42520518
##   0.0    0.10    0.7127907  0.42515479
##   0.0    0.11    0.7116279  0.42280922
##   0.0    0.12    0.7081395  0.41586750
##   0.0    0.13    0.7081395  0.41592501
##   0.0    0.14    0.7081395  0.41592501
##   0.0    0.15    0.7058140  0.41126570
##   0.0    0.16    0.7058140  0.41126570
##   0.0    0.17    0.7058140  0.41126570
##   0.0    0.18    0.7069767  0.41356123
##   0.0    0.19    0.7093023  0.41810141
##   0.0    0.20    0.7069767  0.41345041
##   0.1    0.01    0.7197674  0.43944483
##   0.1    0.02    0.7104651  0.42074226
##   0.1    0.03    0.7093023  0.41845717
##   0.1    0.04    0.7116279  0.42309600
##   0.1    0.05    0.7104651  0.42070022
##   0.1    0.06    0.7127907  0.42544718
##   0.1    0.07    0.7127907  0.42549770
##   0.1    0.08    0.7174419  0.43479515
##   0.1    0.09    0.7162791  0.43233411
##   0.1    0.10    0.7174419  0.43474753
##   0.1    0.11    0.7174419  0.43480280
##   0.1    0.12    0.7139535  0.42781129
##   0.1    0.13    0.7093023  0.41850393
##   0.1    0.14    0.7081395  0.41615053
##   0.1    0.15    0.7093023  0.41845078
##   0.1    0.16    0.7104651  0.42080144
##   0.1    0.17    0.7104651  0.42086171
##   0.1    0.18    0.7104651  0.42086171
##   0.1    0.19    0.7081395  0.41619517
##   0.1    0.20    0.7093023  0.41855078
##   0.2    0.01    0.7174419  0.43476642
##   0.2    0.02    0.7116279  0.42310082
##   0.2    0.03    0.7127907  0.42545998
##   0.2    0.04    0.7127907  0.42555019
##   0.2    0.05    0.7162791  0.43243179
##   0.2    0.06    0.7093023  0.41836520
##   0.2    0.07    0.7116279  0.42314413
##   0.2    0.08    0.7081395  0.41621547
##   0.2    0.09    0.7069767  0.41398054
##   0.2    0.10    0.7081395  0.41630806
##   0.2    0.11    0.7081395  0.41630820
##   0.2    0.12    0.7081395  0.41630820
##   0.2    0.13    0.7093023  0.41859618
##   0.2    0.14    0.7104651  0.42089166
##   0.2    0.15    0.7046512  0.40913068
##   0.2    0.16    0.7058140  0.41142113
##   0.2    0.17    0.7058140  0.41143621
##   0.2    0.18    0.7034884  0.40677202
##   0.2    0.19    0.7034884  0.40670404
##   0.2    0.20    0.7046512  0.40899209
##   0.4    0.01    0.7116279  0.42312587
##   0.4    0.02    0.7104651  0.42096235
##   0.4    0.03    0.7081395  0.41605441
##   0.4    0.04    0.7034884  0.40678730
##   0.4    0.05    0.7069767  0.41397283
##   0.4    0.06    0.7058140  0.41160157
##   0.4    0.07    0.7081395  0.41624817
##   0.4    0.08    0.7046512  0.40916522
##   0.4    0.09    0.7058140  0.41140300
##   0.4    0.10    0.7034884  0.40674685
##   0.4    0.11    0.7011628  0.40203488
##   0.4    0.12    0.7000000  0.39974949
##   0.4    0.13    0.7000000  0.39962385
##   0.4    0.14    0.7011628  0.40192161
##   0.4    0.15    0.7011628  0.40193869
##   0.4    0.16    0.7011628  0.40194124
##   0.4    0.17    0.7011628  0.40194124
##   0.4    0.18    0.7023256  0.40431513
##   0.4    0.19    0.7034884  0.40654282
##   0.4    0.20    0.7046512  0.40878271
##   0.6    0.01    0.7116279  0.42323765
##   0.6    0.02    0.7034884  0.40688051
##   0.6    0.03    0.7023256  0.40461504
##   0.6    0.04    0.7069767  0.41404086
##   0.6    0.05    0.7093023  0.41846299
##   0.6    0.06    0.7058140  0.41145838
##   0.6    0.07    0.6988372  0.39751711
##   0.6    0.08    0.7000000  0.39975722
##   0.6    0.09    0.6976744  0.39505288
##   0.6    0.10    0.7000000  0.39963613
##   0.6    0.11    0.7023256  0.40431513
##   0.6    0.12    0.7046512  0.40896609
##   0.6    0.13    0.7034884  0.40655223
##   0.6    0.14    0.7000000  0.39950294
##   0.6    0.15    0.7023256  0.40395497
##   0.6    0.16    0.7034884  0.40618010
##   0.6    0.17    0.7046512  0.40814743
##   0.6    0.18    0.7023256  0.40324587
##   0.6    0.19    0.6988372  0.39578035
##   0.6    0.20    0.6965116  0.39096724
##   0.8    0.01    0.7116279  0.42324274
##   0.8    0.02    0.6988372  0.39761054
##   0.8    0.03    0.7058140  0.41167516
##   0.8    0.04    0.7023256  0.40452197
##   0.8    0.05    0.6976744  0.39516907
##   0.8    0.06    0.7000000  0.39977453
##   0.8    0.07    0.6988372  0.39734327
##   0.8    0.08    0.7011628  0.40194105
##   0.8    0.09    0.7034884  0.40655223
##   0.8    0.10    0.7023256  0.40414898
##   0.8    0.11    0.7011628  0.40166683
##   0.8    0.12    0.7011628  0.40133184
##   0.8    0.13    0.7034884  0.40535926
##   0.8    0.14    0.6976744  0.39339440
##   0.8    0.15    0.6976744  0.39326988
##   0.8    0.16    0.6976744  0.39326988
##   0.8    0.17    0.7011628  0.39988645
##   0.8    0.18    0.6941860  0.38530272
##   0.8    0.19    0.6790698  0.35410130
##   0.8    0.20    0.6709302  0.33683580
##   1.0    0.01    0.7058140  0.41160732
##   1.0    0.02    0.6988372  0.39767869
##   1.0    0.03    0.7093023  0.41858603
##   1.0    0.04    0.6976744  0.39532230
##   1.0    0.05    0.6976744  0.39519082
##   1.0    0.06    0.7000000  0.39970863
##   1.0    0.07    0.6988372  0.39719159
##   1.0    0.08    0.7011628  0.40179510
##   1.0    0.09    0.7000000  0.39904645
##   1.0    0.10    0.7011628  0.40054378
##   1.0    0.11    0.6976744  0.39339186
##   1.0    0.12    0.6965116  0.39095716
##   1.0    0.13    0.6976744  0.39328181
##   1.0    0.14    0.6918605  0.38096251
##   1.0    0.15    0.6802326  0.35683738
##   1.0    0.16    0.6709302  0.33712333
##   1.0    0.17    0.6651163  0.32358741
##   1.0    0.18    0.6267442  0.24405011
##   1.0    0.19    0.5511628  0.08393029
##   1.0    0.20    0.5116279  0.00000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.1 and lambda = 0.01.

Plot how the model accuracy changes with the mixing percentage(alpha parameter) and regularization parameter(lambda parameter)

# CV accuracy profile across the tuning grid.
# NOTE(review): glmnet_fit is a caret `train` object; 'xvar' and 'label'
# are plot.glmnet arguments and are likely ignored by plot.train — confirm
# the intended call.
plot(glmnet_fit, xvar = "dev", label = TRUE,scales = list(x = list(log = 2)))

We want to see the coefficients of the predictors of the best model. In our cross-validation experiments, two hyperparameter sets achieved the best accuracy, so we have two equivalent best models and two sets of coefficients, as shown below.

# Rows of the CV results that achieved the best accuracy (may be > 1 row).
# FIX: filter() already evaluates column names inside the data, so the
# repeated glmnet_fit$results$ prefix was redundant.
best_param <- glmnet_fit$results %>% filter(Accuracy == max(Accuracy))

# Refit glmnet directly with the winning hyperparameters.
# NOTE(review): if several parameter sets tie, 'lambda'/'alpha' are vectors
# and glmnet fits one model per lambda, so coef() returns one column per fit.
best_fit <- glmnet(
  as.matrix(data_train %>% select(-c('Season','D_t2Num','team1win','team1id','team2id','diff_rank','FTM_t1','FTM_t2','Res_t2','Res_t1'))),
  data_train$team1win,
  lambda = best_param$lambda,
  alpha = best_param$alpha,
  family = "binomial"
)
# Coefficients of the best model ('.' means shrunk to exactly zero)
coef(best_fit)
## 43 x 1 sparse Matrix of class "dgCMatrix"
##                        s0
## (Intercept)  3.919146e+00
## t1_rank     -5.120394e-02
## t1_rank_n   -7.184733e-02
## t2_rank      6.027526e-02
## t2_rank_n    4.977675e-02
## Score_t1    -3.426987e-03
## FGM_t1       5.107888e-02
## FGA_t1       .           
## FGM3_t1      3.070900e-02
## FGA3_t1      .           
## FTA_t1      -9.996756e-02
## OR_t1        1.600694e-01
## DR_t1        .           
## Ast_t1       1.042594e-02
## TO_t1       -1.181789e-01
## Stl_t1      -3.494907e-02
## Blk_t1      -9.571860e-03
## PF_t1       -1.005106e-02
## FGP_t1       6.068800e+00
## FGP2_t1     -1.139388e+00
## FGP3_t1     -3.101473e+00
## FTP_t1       2.832346e+00
## Outcome_t1   3.895055e-01
## Score_t2    -2.390381e-02
## FGM_t2       .           
## FGA_t2       4.339420e-02
## FGM3_t2      .           
## FGA3_t2      1.149702e-02
## FTA_t2       3.630485e-02
## OR_t2       -1.530429e-01
## DR_t2        9.942938e-03
## Ast_t2       1.508885e-01
## TO_t2        1.194181e-01
## Stl_t2      -1.139296e-01
## Blk_t2      -1.763140e-01
## PF_t2       -9.864318e-03
## FGP_t2      -6.388998e+00
## FGP2_t2     -7.457883e+00
## FGP3_t2      5.173612e+00
## FTP_t2      -2.461668e+00
## Outcome_t2  -2.181682e+00
## Distance_t1 -2.418028e-07
## Distance_t2 -2.560409e-08

Besides glmnet, cv.glment is also a commonly used logistic function to build models automatically using cross validation. For logistic regression, cv.glmnet has similar arguments and usage as Gaussian. nfolds, weights, lambda, parallel are all available to users.

The following model uses misclassification error as the criterion for 10-fold cross-validation, and then we plot the object and show the optimal values of λ .

# cv.glmnet: built-in cross-validation (default nfolds = 10) selecting
# lambda by misclassification error for the binomial family.
cv_x <- as.matrix(
  data_train %>%
    select(-c('Season', 'D_t2Num', 'team1win', 'team1id', 'team2id',
              'diff_rank', 'FTM_t1', 'FTM_t2', 'Res_t2', 'Res_t1'))
)
cvfit <- cv.glmnet(cv_x, data_train$team1win, family = "binomial", type.measure = "class")
# CV error vs log(lambda), with markers at the optimal lambda values
plot(cvfit)

Model evaluation

Make predictions on the held-out test data set and produce the predicted probability that team 1 wins

# Class probabilities on the held-out test set; one column per outcome
# level ('0'/'1'), so column 2 is P(team 1 wins)
pred_prob <- predict(glmnet_fit, newdata = data_test, type = 'prob')

Decide on the optimal prediction probability cutoff for the model. The default cutoff probability is 0.5 or the ratio of 1's and 0's in the training data. But sometimes, tuning the probability cutoff can improve accuracy in both the development and validation samples. The InformationValue::optimalCutoff function provides ways to find the optimal cutoff to improve the prediction of 1's, 0's, or both, and to reduce the misclassification error. Let's compute the optimal score that minimizes the misclassification error for the above model.

# Probability cutoff that minimizes misclassification error on the test set
# (pred_prob[2] is the P(team1win = 1) column)
optCutOff <- optimalCutoff(data_test$team1win, pred_prob[2])[1] #0.4785712

A confusion matrix is a table that is often used to describe the performance of a classification model (or “classifier”) on a set of test data for which the true values are known.

# Confusion matrix at the tuned cutoff.
# NOTE(review): both caret and InformationValue export confusionMatrix();
# this call uses InformationValue's (actuals, predictedScores, threshold)
# signature, so InformationValue must mask caret here.
confusionMatrix(data_test$team1win, pred_prob[2],threshold = optCutOff)
##    0  1
## 0 91 43
## 1 18 61

ROC

Receiver Operating Characteristics Curve traces the percentage of true positives accurately predicted by a given logit model as the prediction probability cutoff is lowered from 1 to 0. For a good model, as the cutoff is lowered, it should mark more of actual 1’s as positives and lesser of actual 0’s as 1’s. So for a good model, the curve should rise steeply, indicating that the TPR (Y-Axis) increases faster than the FPR (X-Axis) as the cutoff score decreases. Greater the area under the ROC curve, better the predictive ability of the model.

# ROC curve (with AUC) for the test-set predicted probabilities
plotROC(data_test$team1win, pred_prob[2])

Compute the distance between the game location and each team's home venue

skip for now, need to know how the home venue location was generated for each team

#distm(c(lon1, lat1), c(lon2, lat2), fun = distHaversine)